# Reading the pre-prepared dataset
all <- read_excel("che_covid19.xlsx")[,2:40] %>% relocate(mortality, .before = `Country Code`)
all
# There are some extra variables, but we will not use all of them
to_use <- c("mortality", "che", "beds", "pop65", "popdens", "urban", "dphe", "dghe", "tobacco", "procur", "doctors", "nurses", "beh_stayhome", "beh_socgathering", "beh_distance", "beh_tellsymp", "beh_handwash", "fob_social", "fob_handshake", "fob_stores", "fob_curfew", "perceivedreaction_d", "govtrust_d", "govfact_d", "perceivedeffectiveness_d", "region", "population")
# Selecting columns we plan to use
data <- subset(all, select=to_use)
data
# Alleviating the problems in the next function caused by "_" in column names
colnames(data) <- gsub("_", ".", colnames(data))
# Calculating descriptive statistics
number <- function(x, na.rm = TRUE){return(sum(!is.na(x)))}
stats <- data %>%
summarise(across(where(is.numeric),
list(mean = mean, median = median, sd = sd, min = min, max = max, Q1=~quantile(., probs = 0.25), Q3=~quantile(., probs = 0.75)),
na.rm = TRUE)) %>%
pivot_longer(everything(), names_to = "name", values_to = "value") %>%
separate(name, c("variable", "statistic"), sep = "_") %>%
pivot_wider(names_from = statistic, values_from = value) %>%
arrange(variable) %>%
select(variable, mean, sd, min, Q1, median, Q3, max)
# Changing column names back
colnames(data) <- gsub("\\.", "_", colnames(data))
data
# Changing variable names to match
stats$variable <- gsub("\\.", "_", stats$variable)
options(scipen=999) # Disabling scientific notation (e.g., e+01)
# Putting variables in the original order
stats = na.omit(stats[match(to_use, stats$variable),])
stats
round_df <- function(x, digits) {
# round all numeric variables
# x: data frame
# digits: number of digits to round
numeric_columns <- sapply(x, mode) == 'numeric'
x[numeric_columns] <- round(x[numeric_columns], digits)
x
}
# Rounding the numbers
round_df(stats, 3)
| variable | mean | sd | min | Q1 | median | Q3 | max |
|---|---|---|---|---|---|---|---|
| mortality | 134.616 | 106.706 | 0.390 | 50.638 | 125.660 | 204.732 | 605.680 |
| che | 7.623 | 2.580 | 2.495 | 5.689 | 7.540 | 9.260 | 16.885 |
| beds | 3.871 | 2.506 | 0.630 | 2.203 | 3.120 | 5.128 | 13.050 |
| pop65 | 14.778 | 5.963 | 1.523 | 9.130 | 15.593 | 19.762 | 28.002 |
| popdens | 296.441 | 1043.790 | 3.576 | 34.626 | 99.851 | 218.329 | 8044.526 |
| urban | 75.385 | 16.109 | 18.585 | 66.545 | 79.732 | 87.029 | 100.000 |
| dphe | 35.833 | 14.427 | 13.667 | 25.914 | 34.440 | 48.149 | 70.604 |
| dghe | 63.963 | 14.562 | 28.732 | 51.746 | 64.810 | 74.086 | 85.323 |
| tobacco | 24.247 | 8.766 | 7.900 | 18.325 | 23.500 | 28.875 | 44.700 |
| procur | 0.517 | 0.504 | 0.000 | 0.000 | 1.000 | 1.000 | 1.000 |
| doctors | 33.812 | 16.049 | 4.650 | 23.620 | 32.370 | 43.598 | 80.130 |
| nurses | 69.248 | 50.026 | 2.800 | 26.465 | 61.645 | 102.375 | 216.700 |
| beh_stayhome | 83.462 | 7.333 | 58.678 | 81.242 | 84.828 | 87.905 | 94.411 |
| beh_socgathering | 92.326 | 4.991 | 75.297 | 90.952 | 94.353 | 95.561 | 99.000 |
| beh_distance | 78.763 | 9.116 | 47.866 | 74.073 | 81.322 | 85.271 | 90.561 |
| beh_tellsymp | 92.846 | 4.281 | 78.447 | 92.823 | 94.256 | 95.094 | 97.724 |
| beh_handwash | 91.686 | 2.830 | 83.720 | 90.474 | 92.055 | 93.745 | 96.566 |
| fob_social | 0.976 | 0.038 | 0.786 | 0.976 | 0.989 | 0.994 | 1.000 |
| fob_handshake | 0.967 | 0.037 | 0.745 | 0.959 | 0.977 | 0.986 | 1.000 |
| fob_stores | 0.806 | 0.167 | 0.197 | 0.768 | 0.870 | 0.910 | 0.971 |
| fob_curfew | 0.714 | 0.194 | 0.159 | 0.592 | 0.741 | 0.875 | 0.990 |
| perceivedreaction_d | 0.403 | 0.233 | 0.000 | 0.232 | 0.357 | 0.565 | 0.908 |
| govtrust_d | 0.575 | 0.245 | 0.090 | 0.376 | 0.584 | 0.799 | 0.957 |
| govfact_d | 0.633 | 0.239 | 0.092 | 0.491 | 0.709 | 0.823 | 0.979 |
| perceivedeffectiveness_d | 0.888 | 0.054 | 0.696 | 0.854 | 0.900 | 0.926 | 0.966 |
| population | 64586735.900 | 187687848.304 | 360563.000 | 5427584.250 | 10730269.500 | 47935001.500 | 1397715000.000 |
# Computing the correlation matrix for the numeric variables (all except region)
CorMatrix = cor(data[, !names(data) %in% c("region")] , use = "complete.obs")
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# Matrix of p-values (H0: correlation = 0)
p.mat <- cor.mtest(data[, !names(data) %in% c("region")])
CorMatrix<-round(CorMatrix,2)
col <- colorRampPalette(c("deeppink", "hotpink", "lightpink", "floralwhite", "darkseagreen1", "darkslategray2", "dodgerblue4"))
#png(file="corr.png", res=300, width=4500, height=4500)
corrplot(CorMatrix, method="color", col=col(200),
type="upper", order="original",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, # Text label color and rotation
# Combine with significance
p.mat = p.mat, sig.level = c(0.01, 0.05, 0.1), insig ="label_sig",
# Hide correlation coefficient on the principal diagonal
diag=FALSE , number.cex=0.9, tl.cex=1, pch.cex=1)
Correlogram
# Exploring region
data %>% ggplot(aes(y = mortality, x = region)) +
geom_boxplot()
# Drawing a map
theme_set(theme_bw())
thismap = map_data("world")
all$`Country Name` <- recode(all$`Country Name`, "'Egypt, Arab Rep.' = 'Egypt'; 'United Kingdom' = 'UK'; 'Korea, Rep.' = 'South Korea'; 'Russian Federation' = 'Russia'; 'Slovak Republic' = 'Slovakia'; 'United States' = 'USA'")
# Setting colors
thismap <- mutate(thismap, fill = ifelse(region %in% all$`Country Name`[all$region == 'Americas'], "#FF7F11", ifelse(region %in% all$`Country Name`[all$region == 'Europe'], "#1446A0", ifelse(region %in% all$`Country Name`[all$region == 'Western Pacific'], "#DB3069", ifelse(region %in% all$`Country Name`[all$region == 'South-East Asia'], "#00AF54", ifelse(region %in% all$`Country Name`[all$region == 'Eastern Mediterranean'], "#F5D547", "white"))))))
# Using scale_fill_identity to set correct colors
#png(file="map.png", res=300, width=4500, height=3000)
ggplot(thismap, aes(long, lat, fill = fill, group=group)) +
geom_polygon(colour="gray") +
scale_fill_identity("WHO Region", guide = "legend", labels = c("South-East Asia", "Europe", "Western Pacific", "Eastern Mediterranean", "Americas", "unavailable")) +
theme(legend.position = "bottom", legend.key.size = unit(1,"cm"), legend.title=element_text(size=30),
legend.text=element_text(size=25))
World Map
formattable(data %>% group_by(region) %>%
summarise(no_rows = length(region)))
| region | no_rows |
|---|---|
| Americas | 12 |
| Eastern Mediterranean | 7 |
| Europe | 33 |
| South-East Asia | 2 |
| Western Pacific | 6 |
# There is too few instances of the three regions, so it makes sense to make an "other" category.
data = data %>% mutate(region = case_when(data$region=="Americas" ~ "Americas", data$region=="Europe" ~ "Europe", TRUE ~ "Other"))
# Exploring procur
data %>% ggplot(aes(y = mortality, x = as.factor(procur))) +
geom_boxplot() + scale_x_discrete(name = "procur")
# Drawing scatterplots of everything with mortality
#png(file="scatter.png", res=300, width=6000, height=8000)
data %>% dplyr::select(c(1:25, 27)) %>%
gather(-mortality, key = "var", value = "value") %>%
ggplot(aes(x = value, y = mortality)) +
geom_point() +
facet_wrap(~ var, ncol=3, scales = "free", shrink=TRUE) +
theme_bw() +
theme(axis.text = element_text(size = 14),
axis.title = element_text( size = 16, face = "bold" ),
legend.position="none",
strip.text = element_text(size = 20))
Scatterplots
# Doing the same things again with the newly rebuilt dataset
allnew <- read_excel("che_covid19-new.xlsx")[,2:37] %>% relocate(mortality, .before = `Country Code`)
# Now when we select countries with no less than 20 respondents in the Global Behaviors and Perceptions survey, there are 96 complete observations
allnew <- filter(allnew, n >= 20)
allnew
# There are some extra variables, but we will not use all of them
to_use1 <- c("mortality", "che", "pop65", "popdens", "urban", "dphe", "dghe", "doctors", "nurses", "beh_stayhome", "beh_socgathering", "beh_distance", "beh_tellsymp", "beh_handwash", "fob_social", "fob_handshake", "fob_stores", "fob_curfew", "perceivedreaction_d", "govtrust_d", "govfact_d", "perceivedeffectiveness_d", "region", "population")
# Selecting columns we plan to use
datanew <- subset(allnew, select=to_use1)
datanew
# Alleviating the problems in the next function caused by "_" in column names
colnames(datanew) <- gsub("_", ".", colnames(datanew))
# Calculating descriptive statistics
number <- function(x, na.rm = TRUE){return(sum(!is.na(x)))}
statsnew <- datanew %>%
summarise(across(where(is.numeric),
list(mean = mean, median = median, sd = sd, min = min, max = max, Q1=~quantile(., probs = 0.25), Q3=~quantile(., probs = 0.75)),
na.rm = TRUE)) %>%
pivot_longer(everything(), names_to = "name", values_to = "value") %>%
separate(name, c("variable", "statistic"), sep = "_") %>%
pivot_wider(names_from = statistic, values_from = value) %>%
arrange(variable) %>%
select(variable, mean, sd, min, Q1, median, Q3, max)
# Changing column names back
colnames(datanew) <- gsub("\\.", "_", colnames(datanew))
datanew
# Changing variable names to match
statsnew$variable <- gsub("\\.", "_", statsnew$variable)
options(scipen=999) # Disabling scientific notation (e.g., e+01)
# Putting variables in the original order
statsnew = na.omit(statsnew[match(to_use1, statsnew$variable),])
statsnew
round_df <- function(x, digits) {
# round all numeric variables
# x: data frame
# digits: number of digits to round
numeric_columns <- sapply(x, mode) == 'numeric'
x[numeric_columns] <- round(x[numeric_columns], digits)
x
}
# Rounding the numbers
round_df(statsnew, 3)
| variable | mean | sd | min | Q1 | median | Q3 | max |
|---|---|---|---|---|---|---|---|
| mortality | 118.112 | 101.068 | 0.390 | 30.935 | 105.145 | 184.107 | 605.680 |
| che | 7.005 | 2.522 | 2.343 | 5.280 | 6.878 | 8.663 | 16.885 |
| pop65 | 12.195 | 6.668 | 1.157 | 6.429 | 12.111 | 18.753 | 28.002 |
| popdens | 268.424 | 856.981 | 3.298 | 46.360 | 100.047 | 220.683 | 8044.526 |
| urban | 68.466 | 20.491 | 17.313 | 56.996 | 71.190 | 83.755 | 100.000 |
| dphe | 39.795 | 16.242 | 11.957 | 26.599 | 39.478 | 49.723 | 77.270 |
| dghe | 57.860 | 18.333 | 14.867 | 45.247 | 59.586 | 73.157 | 88.043 |
| doctors | 27.952 | 17.935 | 0.600 | 12.657 | 26.290 | 40.515 | 80.130 |
| nurses | 58.149 | 46.944 | 2.800 | 19.030 | 51.520 | 74.270 | 216.700 |
| beh_stayhome | 82.679 | 8.712 | 48.359 | 79.057 | 84.243 | 88.498 | 95.720 |
| beh_socgathering | 91.259 | 5.792 | 70.304 | 88.162 | 93.192 | 95.545 | 99.300 |
| beh_distance | 76.325 | 9.914 | 47.866 | 69.791 | 77.600 | 84.310 | 92.067 |
| beh_tellsymp | 92.308 | 4.673 | 78.447 | 90.600 | 93.771 | 95.013 | 99.289 |
| beh_handwash | 91.494 | 3.286 | 80.600 | 90.182 | 91.975 | 93.745 | 97.921 |
| fob_social | 0.978 | 0.033 | 0.786 | 0.977 | 0.989 | 0.995 | 1.000 |
| fob_handshake | 0.965 | 0.045 | 0.661 | 0.956 | 0.975 | 0.985 | 1.000 |
| fob_stores | 0.814 | 0.144 | 0.197 | 0.775 | 0.858 | 0.901 | 0.971 |
| fob_curfew | 0.747 | 0.179 | 0.159 | 0.645 | 0.776 | 0.892 | 1.000 |
| perceivedreaction_d | 0.403 | 0.245 | 0.000 | 0.215 | 0.368 | 0.565 | 0.952 |
| govtrust_d | 0.540 | 0.251 | 0.040 | 0.370 | 0.524 | 0.771 | 0.957 |
| govfact_d | 0.600 | 0.244 | 0.092 | 0.400 | 0.653 | 0.803 | 0.979 |
| perceivedeffectiveness_d | 0.873 | 0.070 | 0.619 | 0.837 | 0.890 | 0.922 | 1.000 |
| population | 69540265.083 | 201817550.622 | 360563.000 | 5658078.250 | 12160829.500 | 53931840.500 | 1397715000.000 |
# Computing the correlation matrix for the numeric variables (all except region)
CorMatrix1 = cor(datanew[, !names(datanew) %in% c("region")] , use = "complete.obs")
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# Matrix of p-values (H0: correlation = 0)
p.mat1 <- cor.mtest(datanew[, !names(datanew) %in% c("region")])
CorMatrix1<-round(CorMatrix1,2)
col <- colorRampPalette(c("deeppink", "hotpink", "lightpink", "floralwhite", "darkseagreen1", "darkslategray2", "dodgerblue4"))
#png(file="newcorr.png", res=300, width=4500, height=4500)
corrplot(CorMatrix1, method="color", col=col(200),
type="upper", order="original",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, # Text label color and rotation
# Combine with significance
p.mat = p.mat1, sig.level = c(0.01, 0.05, 0.1), insig ="label_sig",
# Hide correlation coefficient on the principal diagonal
diag=FALSE , number.cex=0.9, tl.cex=1, pch.cex=1)
# Заново каждый раз, когда library(corrplot)
# trace(corrplot, edit=TRUE)
# заменить на 443 строке
# place_points = function(sig.locs, point) {
#text(pos.pNew[, 1][sig.locs], pos.pNew[, 2][sig.locs],
# labels = point, col = pch.col, cex = pch.cex,
#lwd = 2)
# НА
#place_points = function(sig.locs, point) {
#text(pos.pNew[, 1][sig.locs], (pos.pNew[, 2][sig.locs])+0.25,
#labels = point, col = pch.col, cex = pch.cex,
#lwd = 2)
New Correlogram
formattable(datanew %>% group_by(region) %>%
summarise(no_rows = length(region)))
| region | no_rows |
|---|---|
| Africa | 7 |
| Americas | 18 |
| Eastern Mediterranean | 13 |
| Europe | 43 |
| South-East Asia | 6 |
| Western Pacific | 9 |
# Exploring region
datanew %>% ggplot(aes(y = mortality, x = region)) +
geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Drawing scatterplots of everything with mortality
#png(file="newscatter.png", res=300, width=6000, height=8000)
datanew %>% dplyr::select(c(1:22, 24)) %>%
gather(-mortality, key = "var", value = "value") %>%
ggplot(aes(x = value, y = mortality)) +
geom_point() +
facet_wrap(~ var, ncol=3, scales = "free", shrink=TRUE) +
theme_bw() +
theme(axis.text = element_text(size = 14),
axis.title = element_text( size = 16, face = "bold" ),
legend.position="none",
strip.text = element_text(size = 20))
New Scatterplots
# Drawing a map
theme_set(theme_bw())
thismap1 = map_data("world")
allnew
allnew$`Country Name` <- recode(allnew$`Country Name`, "'Egypt, Arab Rep.' = 'Egypt'; 'United Kingdom' = 'UK'; 'Korea, Rep.' = 'South Korea'; 'Russian Federation' = 'Russia'; 'Slovak Republic' = 'Slovakia'; 'United States' = 'USA'; 'Iran, Islamic Rep.' = 'Iran'")
# Setting colors
thismap1 <- mutate(thismap1, fill = ifelse(region %in% allnew$`Country Name`[allnew$region == 'Americas'], "#FF7F11", ifelse(region %in% allnew$`Country Name`[allnew$region == 'Europe'], "#1446A0", ifelse(region %in% allnew$`Country Name`[allnew$region == 'Western Pacific'], "#DB3069", ifelse(region %in% allnew$`Country Name`[allnew$region == 'South-East Asia'], "#00AF54", ifelse(region %in% allnew$`Country Name`[allnew$region == 'Eastern Mediterranean'], "#F5D547", "white"))))))
# Using scale_fill_identity to set correct colors
#png(file="newmap.png", res=300, width=4500, height=3000)
ggplot(thismap1, aes(long, lat, fill = fill, group=group)) +
geom_polygon(colour="gray") +
scale_fill_identity("WHO Region", guide = "legend", labels = c("South-East Asia", "Europe", "Western Pacific", "Eastern Mediterranean", "Americas", "unavailable")) +
theme(legend.position = "bottom", legend.key.size = unit(1,"cm"), legend.title=element_text(size=30),
legend.text=element_text(size=25))
New World Map
# Drawing a scatterplot over a limited range of density for better visibility
datanew %>%
ggplot(aes(popdens, mortality)) +
geom_jitter(width = 0.25, alpha = 0.5) + xlim(0, 1000)
# Drawing a scatterplot over a limited range of population for better visibility
datanew %>%
ggplot(aes(population, mortality)) +
geom_jitter(width = 0.25, alpha = 0.5) +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6), limits = c(0, 200000000), name = "population (mln.)")